Load libraries
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(gapminder)
Load data
data(gapminder)
schooldata <- read_csv("~/team/bootcamp/R/schooldata.csv")
## Parsed with column specification:
## cols(
## courseid = col_integer(),
## title = col_character(),
## startdate = col_character(),
## grade = col_double(),
## instructorid = col_integer(),
## instructorfirst = col_character(),
## instructorlast = col_character(),
## instructortitle = col_character(),
## studentid = col_integer(),
## gender = col_character(),
## age = col_integer(),
## admissionyear = col_integer()
## )
Create histogram
hist(schooldata$grade)
You can change the bins with the
breaks paramter
hist(schooldata$grade, breaks=30)
You can also set the scales, axis limits, and titles
hist(schooldata$grade,
breaks=seq(.875, 4.125, .25),
xlim=c(0,4),
main="Grade Distribution",
xlab="Grade",
ylab="Count")
Create a scatterplot of life expectancy on the y-axis and GDP per capita on the x-axis
plot(x=gapminder$gdpPercap[gapminder$year==2002],
y=gapminder$lifeExp[gapminder$year==2002],
log="x")
To save a plot, you need to start a device with a file name, recreate the graphics, and then save it. You can set dimensions, which are in inches. To save the graphic, turn off the use the
dev.off() command
pdf("plot1.pdf", width=6, height=4)
plot(x=gapminder$gdpPercap[gapminder$year==2002],
y=gapminder$lifeExp[gapminder$year==2002],
log="x")
dev.off()
## png
## 2
Let’s recreate all of the base R plots in ggplot
Create a histogram with ggplot
ggplot(schooldata, aes(grade)) +
geom_histogram(bins=20)
Instead of setting the number of bins with
bins, you can set the binwidth
ggplot(schooldata, aes(grade)) +
geom_histogram(binwidth=.25) +
xlim(0, NA) +
xlab("Grade") +
ylab("Count") +
ggtitle("Grade Distribution")
Let’s split the data by gender. ggplot’s
facet handles this much better than base R where you have to set the layout and make each individual plot
ggplot(schooldata, aes(grade)) +
geom_histogram(binwidth=.25) +
xlim(0, NA) +
xlab("Grade") +
ylab("Count") +
ggtitle("Grade Distribution") +
facet_grid(gender~.)
Let’s filter out the missing data and set the order with
dplyr before handing it over to ggplot
schooldata %>%
filter(!is.na(gender)) %>%
mutate(gender = factor(gender,
levels=c("M", "F"),
labels=c("Male", "Female"))) %>%
ggplot(aes(grade)) +
geom_histogram(binwidth=.25) +
xlim(0, NA) +
xlab("Grade") +
ylab("Count") +
ggtitle("Grade Distribution") +
facet_grid(gender~.)
Create a scatterplot of life expectancy on the y-axis and GDP per capita on the x-axis and color by the variable
continent
gapminder %>%
filter(year==2002) %>%
ggplot(aes(x=gdpPercap, y=lifeExp, color=continent)) +
geom_point() +
scale_x_log10()
Save the plot.
ggsave will save the last plot that was made.
ggsave("p1.pdf", width=6, height=4)
library(babynames)
data(babynames)
ggplotCreate initial and initial proportions columns
babynames <- mutate(babynames, initial=substr(name, 1, 1))
initial_props <-
babynames %>%
group_by(year, sex, initial) %>%
summarize(count=sum(n), totalprop=sum(prop)) %>%
ungroup()
Plot girls with the initial A over time
initial_props %>%
filter(sex=="F" & initial=="A") %>%
ggplot(aes(y=totalprop, x=year)) +
geom_line()
Plot boys and girls with the initial
A over time
initial_props %>%
filter(initial=="A") %>%
ggplot(aes(y=totalprop, x=year, color=sex)) +
geom_line()
Change the
linetype instead of color
initial_props %>%
filter(initial=="A") %>%
ggplot(aes(y=totalprop, x=year)) +
geom_line(aes(linetype=sex), color="red", size=1)
Create plot of proportion by year for each initial
initial_props %>%
ggplot(aes(y=totalprop, x=year, color=sex)) +
geom_line() +
facet_wrap(~initial) +
scale_color_discrete(labels=c("Female", "Male"), name="Sex") +
scale_y_continuous(breaks=c(0, .1, .2)) +
theme(axis.text.x = element_text(angle=-90, vjust=0.5),
panel.grid.minor = element_blank()) +
labs(title="Baby Names First Initial",
x="Year",
y="Proportion of Babies")
#### Let’s find unisex names with
dplyr
topnames <- babynames %>%
filter(year>=1950) %>%
group_by(name) %>%
summarize(total=sum(n)) %>%
arrange(desc(total)) %>%
slice(1:1000) %>%
inner_join(babynames) %>%
filter(year >= 1950) %>%
mutate(sex=recode(sex, "F"="Female", "M"="Male")) %>%
group_by(name, sex, total) %>%
summarise(sextotal=sum(n)) %>%
spread(key=sex, value=sextotal) %>%
mutate(ratio=(Male-Female)/total)
## Joining, by = "name"
Create a scatterplot with topnames dataframe
ggplot(topnames, aes(x=Male, y=Female)) +
geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
Find row where there is missing data
topnames %>%
filter(is.na(Male) | is.na(Female))
## # A tibble: 1 x 5
## # Groups: name [1]
## name total Female Male ratio
## <chr> <int> <int> <int> <dbl>
## 1 Penelope 37666 37666 NA NA
Set limits to focus on bottom right part of the plot
ggplot(topnames, aes(x=Male, y=Female, color=ratio)) +
geom_point() +
lims(x=c(0, 250000), y=c(0, 250000)) +
geom_abline() +
scale_color_gradient(low="pink", high="blue",
name="Sex Dominance",
breaks=c(.9, 0, -.9),
labels=c("Male", "Neutral", "Female")) +
geom_text(aes(label=ifelse(abs(ratio) < .2,
name,
"")),
hjust=-.25, vjust=0.5, color="gray10",
fontface="bold", size=3)
## Warning: Removed 180 rows containing missing values (geom_point).
## Warning: Removed 180 rows containing missing values (geom_text).
Reconfigure the data to be used with a bar chart. For a bar chart, you need to gather the data and have groups. (This is undoing what we did with
spread when creating topnames)
topnames2 <- gather(topnames, key="sex", value="sextotal", Male, Female)
Create a bar chart for the most common unisex names
topnames2 %>%
filter(abs(ratio) < .3 & !is.na(ratio)) %>%
ggplot(aes(x=name, y=sextotal, fill=sex)) +
geom_bar(stat="identity") +
labs(title="Popular Unisex Names", x="", y="Count")
Show the split instead of count
topnames2 %>%
filter(abs(ratio) < .3 & !is.na(ratio)) %>%
ggplot(aes(x=name, y=sextotal, fill=sex)) +
geom_bar(stat="identity", position="fill") +
geom_hline(yintercept = .5) +
labs(title="Popular Unisex Names", x="", y="Split")
Show a paired bar chart instead of stacked bar chart
topnames2 %>%
filter(abs(ratio) < .3 & !is.na(ratio)) %>%
ggplot(aes(x=name, y=sextotal, fill=sex)) +
geom_bar(stat="identity", position="dodge") +
labs(title="Popular Unisex Names", x="", y="Count")
Load
ggthemes package
library(ggthemes)
Change theme of plot
topnames2 %>%
filter(abs(ratio) < .3 & !is.na(ratio)) %>%
ggplot(aes(x=name, y=sextotal, fill=sex)) +
geom_bar(stat="identity") +
labs(title="Popular Unisex Names", x="", y="Count") +
theme_minimal() +
theme(panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank())
initial_props %>%
filter(initial=="A") %>%
ggplot(aes(y=totalprop, x=year, color=sex)) +
geom_line() +
theme_fivethirtyeight()
Add theme after plot and save
myplot <- initial_props %>%
filter(initial=="A") %>%
ggplot(aes(y=totalprop, x=year, color=sex)) +
geom_line()
myplot + theme_calc()
ggsave("plot2.pdf", myplot)
## Saving 7 x 5 in image
plotlyLoad plotly library
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Create column with first two letters of each name and subset A female names
babynames <- mutate(babynames, first2=substr(name, 1, 2))
anames <- babynames %>%
filter(initial=="A" & sex=="F") %>%
group_by(year, first2) %>%
summarize(count=sum(n), totalprop=sum(prop)) %>%
ungroup()
Plot anames proportion by year
p1 <- anames %>%
ggplot(aes(y=totalprop, x=year, color=first2)) +
geom_line()
Use wrapper function to be able to get tooltips with plotly
ggplotly(p1)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
Make a plot directly with plotly
plot_ly(anames, x=~year, y=~totalprop, color=~first2, type="scatter", mode="lines")
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
Make a plot 3D (poor example due to data, but shows functionality)
plot_ly(anames, x=~year, y=~first2, z=~totalprop, color=~first2, line=list(width=4)) %>%
add_lines() %>%
layout(scene=list(xaxis=list(title="Year"),
yaxis=list(title="Starting Letters"),
zaxis=list(title="Proportion")))
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors